library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at C:/Users/Howard/Documents/GitHub/CodeClan/pda_dirty_data
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(stringr)
library(assertr)
library(testthat)
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:purrr':
##
## is_null
## The following objects are masked from 'package:readr':
##
## edition_get, local_edition
## The following object is masked from 'package:tidyr':
##
## matches
library(readxl)
1 MVP. 1.1 Task 1 - Decathlon Data. This data is contained in the .rds file decathlon.rds. You’ll need to use read_rds() from readr to open it.
# Run the project's reader script, located relative to the project root by
# here::here(). Presumably this is what creates `decathlon_data` -- confirm in
# R_scripts/read_rds_data.R.
source(here::here("R_scripts/read_rds_data.R"))
# Print the data set so it can be inspected before answering the questions.
decathlon_data
## 1 Who had the longest long jump seen in the data?
# Keep every row whose long_jump equals the overall maximum, so ties are
# all returned.
filter(decathlon_data, long_jump == max(long_jump))
## 2 What was the average 100m time in each competition?
# One row per competition. The summary column is named explicitly so the
# result does not carry the awkward default name `mean(x100m)`.
decathlon_data %>%
  group_by(competition) %>%
  summarise(avg_100m_time = mean(x100m))
## 3 Which row(s) have the highest points value seen in the data?
# Keep every row whose points equals the overall maximum.
filter(decathlon_data, points == max(points))
## 4 What were the shot-put scores for the top three competitors in each competition?
# A single grouped slice replaces the two copy-pasted pipelines that each
# hard-coded one competition name, so every competition in the data is covered.
# with_ties = FALSE keeps exactly three rows per competition, matching the
# previous head(3) behaviour.
decathlon_data %>%
  select(competition, competitor, shot_put) %>%
  group_by(competition) %>%
  slice_max(shot_put, n = 3, with_ties = FALSE) %>%
  ungroup()
## 5 Average points for rows with a 400m time under 50 secs vs. 50 secs or more
# Each branch reduces its subset to one rounded mean, pulls that number out of
# the one-row summary and pastes the label onto it. rbind() stacks the two
# labelled strings into a two-row character matrix.
rbind(
  decathlon_data %>%
    filter(x400m < 50) %>%
    summarise(avg_points = round(mean(points), 2)) %>%
    pull(avg_points) %>%
    paste(" average points where 400m < 50 secs"),
  decathlon_data %>%
    filter(x400m >= 50) %>%
    summarise(avg_points = round(mean(points), 2)) %>%
    pull(avg_points) %>%
    paste(" average points where 400m >= 50 secs")
)
## [,1]
## [1,] "8120.48 average points where 400m < 50 secs"
## [2,] "7727.17 average points where 400m >= 50 secs"
The data is in files boing-boing-candy-2015.xlsx, boing-boing-candy-2016.xlsx and boing-boing-candy-2017.xlsx. Bear in mind that this is trickier than tasks 1, 2 & 3.
# Run the project's candy read-in script, located relative to the project root
# by here::here(). The coercion warnings echoed after this call are raised
# while the script runs. Presumably it creates `candy_data_3` -- confirm in
# R_scripts/read_in_candy_data.R.
source(here::here("R_scripts/read_in_candy_data.R"))
## Warning: Unknown or uninitialised column: `which_country_do_you_live_in`.
## Warning in eval(ei, envir): NAs introduced by coercion
## Warning in eval(ei, envir): NAs introduced by coercion to integer range
## Warning in eval(ei, envir): NAs introduced by coercion
## Warning in eval(ei, envir): NAs introduced by coercion
## Warning in eval(ei, envir): NAs introduced by coercion to integer range
The 2015 data was populated using the 2016 residence data, looping to fill in
the larger data set.
# Count rows per value of which_country_do_you_live_in, most frequent first.
# count() groups, tallies and sorts in one step and returns an ungrouped
# tibble, so no grouping is left behind on the result.
candy_data_3 %>%
  count(which_country_do_you_live_in, sort = TRUE)
# Run the project's column-dropping and merging script, located relative to the
# project root by here::here(). Presumably this is what creates `candy_data`
# -- confirm in R_scripts/drop_cols_and_merge.R.
source(here::here("R_scripts/drop_cols_and_merge.R"))
# Print the resulting data set for inspection.
candy_data